What are the primary reasons to use PCA?
# Chunk options for the rendered document: show code, hide messages/warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
data(stop_words)
# NOTE(review): the original loaded tidytext and stop_words a second time here;
# library()/data() are idempotent, so the duplicate calls were removed.
wine <- read_rds("../resources/variety-project.rds") %>% rowid_to_column("id")
# Build a wide 0/1 document-word indicator matrix: one row per review (id),
# one column per frequent word, plus the original wine columns and log(price).
wine <- wine %>%
  unnest_tokens(word, description) %>%            # one row per word per review
  anti_join(stop_words, by = "word") %>%          # explicit key (was implicit)
  # Drop words that would leak the label (variety names) or carry no signal.
  filter(!(word %in% c("drink","vineyard","variety","price","points","wine","pinot","chardonnay","gris","noir","riesling","syrah"))) %>%
  count(id, word) %>%
  group_by(word) %>%
  mutate(total = sum(n)) %>%   # corpus-wide frequency of each word
  filter(total > 500) %>%      # keep only very common words -> tiny feature set
  ungroup() %>%
  # count() guarantees n >= 1, so this is a plain rowwise indicator; the
  # original's group_by(id)/ungroup() around it had no effect and was dropped.
  mutate(exists = if_else(n > 0, 1, 0)) %>%
  pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = 0) %>%
  right_join(wine, by = "id") %>%   # keep reviews containing none of the words
  replace(., is.na(.), 0) %>%       # absent word -> 0 indicator
  mutate(log_price = log(price)) %>%
  select(-id, -price, -description)
names(wine)
## [1] "flavors" "tart" "oak" "finish" "fruit"
## [6] "tannins" "cherry" "light" "variety" "points"
## [11] "log_price"
# PCA on everything except the outcome. Center and scale because the 0/1
# indicators, points, and log_price live on different scales.
# TRUE/FALSE spelled out (T/F are reassignable aliases).
pr_wine <- prcomp(x = select(wine, -variety), scale = TRUE, center = TRUE)
summary(pr_wine)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.3215 1.1071 1.0345 1.0073 0.98302 0.97868 0.96368
## Proportion of Variance 0.1746 0.1226 0.1070 0.1015 0.09663 0.09578 0.09287
## Cumulative Proportion 0.1746 0.2972 0.4042 0.5057 0.60231 0.69810 0.79096
## PC8 PC9 PC10
## Standard deviation 0.9375 0.88299 0.65704
## Proportion of Variance 0.0879 0.07797 0.04317
## Cumulative Proportion 0.8789 0.95683 1.00000
# Scree plot: variance explained per component (look for an "elbow").
screeplot(pr_wine, type = "lines")
# Biplot of observations and variable loadings on PC1/PC2...
biplot(pr_wine)
# ...and on PC3/PC4.
biplot(pr_wine, choices = c(3,4))
# Loadings: contribution of each original variable to each component.
pr_wine$rotation
## PC1 PC2 PC3 PC4 PC5
## flavors -0.09586279 0.389440836 -0.43248272 0.14235301 0.32198847
## tart -0.15495828 -0.381109570 0.25754936 0.35333777 -0.23613301
## oak 0.10988655 0.048765487 0.56910333 -0.35368385 0.61677902
## finish 0.05538319 -0.031176652 -0.38160699 -0.72190787 -0.31458193
## fruit 0.14835040 -0.514950105 0.16337150 -0.24647009 -0.11100017
## tannins 0.11507476 -0.422731509 -0.34175797 0.31485067 0.20099267
## cherry 0.37745872 -0.299556237 -0.21965954 -0.06794250 0.11769187
## light -0.22191639 -0.321285160 -0.28682880 -0.11415138 0.52598416
## points 0.58875678 0.250617819 0.03265683 0.09802766 -0.08867767
## log_price 0.61485717 -0.009917253 -0.04525738 0.14666361 0.10046362
## PC6 PC7 PC8 PC9 PC10
## flavors 0.2943509 -0.37547201 -0.53999015 0.09411969 -0.009713721
## tart -0.3547888 -0.52638039 -0.36184664 -0.20054482 -0.098502625
## oak -0.1417953 0.08153352 -0.32291222 -0.14125713 -0.071973147
## finish -0.3761705 -0.04822750 -0.28703106 -0.06317100 -0.004062862
## fruit 0.5314431 -0.14102233 -0.17992002 0.52757732 -0.005623983
## tannins -0.1936070 0.59748865 -0.35232888 0.08309068 -0.161902215
## cherry 0.3748396 -0.14922759 0.13142585 -0.71143854 -0.121991476
## light -0.2772870 -0.34987118 0.45610413 0.23509155 -0.113892089
## points -0.1678861 -0.15427722 0.09424389 0.24896518 -0.670196814
## log_price -0.2457200 -0.16616464 0.01122554 0.13542793 0.694078656
# Component scores for the first few observations.
head(pr_wine$x)
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] -2.51789057 0.4040814 0.41214587 1.5356567 -0.6488525 -0.35115520
## [2,] -0.05329592 0.6776983 0.41168677 0.6827924 -0.3233852 -0.66438711
## [3,] -1.59677498 -0.2485021 2.81897793 0.3752514 0.4786475 -1.50262229
## [4,] -0.15575115 -0.5264200 -0.09364021 -1.5356814 -1.2681434 -0.23305233
## [5,] -1.41723444 0.1552330 -1.36797242 1.5417984 0.7288760 -0.06114332
## [6,] -1.60016306 1.3850886 -0.38293175 0.6603520 0.1678791 0.44228922
## PC7 PC8 PC9 PC10
## [1,] -1.0781328 -1.16506405 -1.0749213 -0.656687443
## [2,] 0.6757479 0.97921893 -0.2898472 1.710802572
## [3,] -0.2156086 -0.96692838 -1.5598540 -0.355556765
## [4,] 0.4221780 -0.06581793 0.4839961 1.581293171
## [5,] 2.0706983 -1.18788571 -0.2345163 0.004556595
## [6,] 0.2877224 -0.15801766 -0.4431403 0.648703584
# Attach the first four component scores to the outcome and give the
# components interpretable names (chosen from the loadings shown above).
prc <- bind_cols(select(wine, variety), as.data.frame(pr_wine$x)) %>%
  select(1:5) %>%   # variety + PC1..PC4
  # One rename() call instead of four chained ones.
  rename(quality = PC1, flavor = PC2, oak = PC3, tart = PC4)
head(prc)
| variety | quality | flavor | oak | tart |
|---|---|---|---|---|
| Pinot_Gris | -2.5178906 | 0.4040814 | 0.4121459 | 1.5356567 |
| Pinot_Noir | -0.0532959 | 0.6776983 | 0.4116868 | 0.6827924 |
| Pinot_Noir | -1.5967750 | -0.2485021 | 2.8189779 | 0.3752514 |
| Pinot_Noir | -0.1557511 | -0.5264200 | -0.0936402 | -1.5356814 |
| Pinot_Noir | -1.4172344 | 0.1552330 | -1.3679724 | 1.5417984 |
| Pinot_Noir | -1.6001631 | 1.3850886 | -0.3829317 | 0.6603520 |
# Density of the first two component scores by variety: do these components
# separate the classes?
prc %>%
select(variety, quality, flavor) %>%
pivot_longer(cols = -variety,names_to = "component",values_to = "loading") %>%
ggplot(aes(loading, fill=variety))+
geom_density(alpha=0.5)+
facet_grid(.~component)
# Same view for the third and fourth components.
prc %>%
select(variety, oak, tart) %>%
pivot_longer(cols = -variety,names_to = "component",values_to = "loading") %>%
ggplot(aes(loading, fill=variety))+
geom_density(alpha=0.5)+
facet_grid(.~component)
# Rebuild the indicator matrix from scratch with a much lower frequency
# cutoff (total > 100), yielding ~186 word features instead of 10.
wine <- read_rds("../resources/variety-project.rds") %>% rowid_to_column("id")
wine <- wine %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!(word %in% c("drink","vineyard","variety","price","points","wine","pinot","chardonnay","gris","noir","riesling","syrah"))) %>%
  count(id, word) %>%
  group_by(word) %>%
  mutate(total = sum(n)) %>%
  filter(total > 100) %>%   # looser cutoff than the first pass (500)
  ungroup() %>%
  # Rowwise indicator; n >= 1 after count(), so no grouping is needed.
  mutate(exists = if_else(n > 0, 1, 0)) %>%
  pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = 0) %>%
  right_join(wine, by = "id") %>%
  replace(., is.na(.), 0) %>%
  mutate(log_price = log(price)) %>%
  select(-id, -price, -description)
names(wine)
## [1] "acidity" "crisp" "fermented" "flavors"
## [5] "green" "pineapple" "stainless" "tart"
## [9] "bottling" "earthy" "herbal" "tannic"
## [13] "aged" "berry" "chocolate" "herb"
## [17] "hint" "mix" "months" "oak"
## [21] "oregon" "stem" "finish" "flavor"
## [25] "fruit" "leaf" "notes" "sweet"
## [29] "touch" "brings" "tannins" "fruity"
## [33] "lightly" "pleasant" "simple" "strawberry"
## [37] "acids" "cherry" "forward" "lingering"
## [41] "ripe" "textural" "acid" "aromas"
## [45] "cranberry" "light" "barrels" "spicy"
## [49] "age" "bit" "bottle" "offers"
## [53] "sharp" "tight" "black" "candy"
## [57] "cellar" "depth" "grapes" "grown"
## [61] "red" "smooth" "supple" "aging"
## [65] "clean" "concentrated" "dark" "earth"
## [69] "fine" "balance" "berries" "complex"
## [73] "elegant" "estate" "peach" "selection"
## [77] "streak" "tea" "vintage" "whiff"
## [81] "wild" "wines" "fruits" "palate"
## [85] "2020" "character" "delicious" "juicy"
## [89] "bright" "carries" "orange" "tangy"
## [93] "citrus" "mouthfeel" "reserve" "spent"
## [97] "time" "balanced" "blend" "coffee"
## [101] "alcohol" "plenty" "power" "slightly"
## [105] "ava" "sourced" "stone" "vines"
## [109] "hints" "length" "midpalate" "peel"
## [113] "rose" "subtle" "caramel" "cola"
## [117] "deep" "streaks" "barrel" "cuvée"
## [121] "dried" "mouth" "concentration" "rich"
## [125] "round" "winery's" "complexity" "excellent"
## [129] "pinots" "valley" "grapefruit" "minerality"
## [133] "single" "style" "note" "dry"
## [137] "pear" "fresh" "cherries" "lovely"
## [141] "pepper" "lemon" "melon" "apple"
## [145] "cinnamon" "pretty" "french" "spice"
## [149] "vanilla" "drinking" "soft" "highlights"
## [153] "plum" "texture" "clone" "effort"
## [157] "focused" "quickly" "loaded" "aromatic"
## [161] "color" "delicate" "scents" "toast"
## [165] "mineral" "lively" "white" "toasty"
## [169] "baking" "spices" "lush" "thin"
## [173] "bitter" "bodied" "medium" "firm"
## [177] "nicely" "vineyards" "sugar" "raspberry"
## [181] "extra" "willamette" "pleasing" "nose"
## [185] "core" "substantial" "variety" "points"
## [189] "log_price"
# PCA on the larger (~189-column) feature set; TRUE/FALSE spelled out.
pr_wine <- prcomp(x = select(wine, -variety), scale = TRUE, center = TRUE)
screeplot(pr_wine, type = "lines")
# Show only variables with a substantial loading (|loading| >= 0.25) on any
# of the first four components, to make the rotation matrix readable.
rownames_to_column(as.data.frame(pr_wine$rotation)) %>%
  select(1:5) %>%
  filter(abs(PC1) >= 0.25 | abs(PC2) >= 0.25 | abs(PC3) >= 0.25 | abs(PC4) >= 0.25)
| rowname | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| months | 0.1116653 | 0.2642817 | 0.2957780 | -0.0009590 |
| oak | 0.0913025 | 0.3178890 | 0.3133270 | 0.0000240 |
| valley | 0.0461676 | 0.0195822 | -0.0093578 | 0.2949542 |
| french | 0.1105282 | 0.2947821 | 0.2935521 | -0.0207073 |
| willamette | 0.0474642 | 0.0211232 | -0.0065160 | 0.2918651 |
| points | 0.2060537 | 0.2563044 | -0.3258908 | 0.0331730 |
| log_price | 0.3548499 | 0.0535944 | -0.0912909 | 0.0433292 |
# Keep the first four components and name them after their dominant loadings
# (see the filtered rotation table above).
prc <- bind_cols(select(wine,variety),as.data.frame(pr_wine$x)) %>%
select(1:5) %>%
rename("pricey"=PC1, "quality_french_oak"=PC2, "crappy_french_oak"=PC3, "willamette_valley"=PC4)
# Class separation along the "pricey" and "willamette_valley" components.
prc %>%
select(variety, pricey,willamette_valley) %>%
pivot_longer(cols = -variety,names_to = "component",values_to = "loading") %>%
ggplot(aes(loading, fill=variety))+
geom_density(alpha=0.5)+
facet_grid(.~component)
# Class separation along the two oak-related components.
prc %>%
select(variety, quality_french_oak, crappy_french_oak) %>%
pivot_longer(cols = -variety,names_to = "component",values_to = "loading") %>%
ggplot(aes(loading, fill=variety))+
geom_density(alpha=0.5)+
facet_grid(.~component)
library(caret)
# Naive Bayes on the four component scores, tuned by cross-validation on Kappa.
# NOTE(review): no set.seed() before train(), so the CV folds (and thus the
# tuning) are not reproducible across runs — confirm whether that matters here.
fit <- train(variety ~ .,
data = prc,
method = "naive_bayes",
metric = "Kappa",
trControl = trainControl(method = "cv"))
# NOTE(review): this evaluates on the training data itself, so the accuracy
# and Kappa below are optimistic relative to held-out performance.
confusionMatrix(predict(fit, prc),factor(prc$variety))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Chardonnay Pinot_Gris Pinot_Noir Riesling Syrah
## Chardonnay 161 47 22 10 3
## Pinot_Gris 116 270 29 105 1
## Pinot_Noir 179 72 2681 39 168
## Riesling 26 49 5 70 0
## Syrah 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7851
## 95% CI : (0.7721, 0.7977)
## No Information Rate : 0.6753
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5266
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Chardonnay Class: Pinot_Gris Class: Pinot_Noir
## Sensitivity 0.33402 0.61644 0.9795
## Specificity 0.97704 0.93057 0.6520
## Pos Pred Value 0.66255 0.51823 0.8541
## Neg Pred Value 0.91575 0.95243 0.9387
## Prevalence 0.11892 0.10807 0.6753
## Detection Rate 0.03972 0.06662 0.6615
## Detection Prevalence 0.05996 0.12855 0.7745
## Balanced Accuracy 0.65553 0.77350 0.8158
## Class: Riesling Class: Syrah
## Sensitivity 0.31250 0.00000
## Specificity 0.97911 1.00000
## Pos Pred Value 0.46667 NaN
## Neg Pred Value 0.96054 0.95756
## Prevalence 0.05527 0.04244
## Detection Rate 0.01727 0.00000
## Detection Prevalence 0.03701 0.00000
## Balanced Accuracy 0.64580 0.50000
Dang! With only four components, the model never predicts Syrah (its entire prediction row is zero) and catches barely a third of the Rieslings — the PCA features alone are not enough to separate the minority varieties.
Latent Dirichlet allocation is one of the most common algorithms for topic modeling. Without diving into the math behind the model, we can understand it as being guided by two principles.
Every document is a mixture of topics. We imagine that each document may contain words from several topics in particular proportions. For example, in a two-topic model we could say “Document 1 is 90% topic A and 10% topic B, while Document 2 is 30% topic A and 70% topic B.”
Every topic is a mixture of words. For example, we could imagine a two-topic model of American news, with one topic for “politics” and one for “entertainment.” The most common words in the politics topic might be “President”, “Congress”, and “government”, while the entertainment topic may be made up of words such as “movies”, “television”, and “actor”. Importantly, words can be shared between topics; a word like “budget” might appear in both equally.
See: https://www.tidytextmining.com/topicmodeling.html#latent-dirichlet-allocation
LDA is a mathematical method for estimating both of these at the same time: finding the mixture of words that is associated with each topic, while also determining the mixture of topics that describes each document.
library(topicmodels)
# Cast token counts into a DocumentTermMatrix for LDA.
# Note: unlike the PCA pipelines above, no frequency cutoff is applied here.
wine_dtm <- read_rds("../resources/variety-project.rds") %>% rowid_to_column("id") %>%
unnest_tokens(word, description) %>%
anti_join(stop_words) %>%
filter(!(word %in% c("drink","vineyard","variety","price","points","wine","pinot","chardonnay","gris","noir","riesling","syrah"))) %>%
count(id,word) %>%
cast_dtm(id, word, n)
# Fit a 4-topic LDA model; the seed makes the (stochastic) fit reproducible.
wine_lda <- LDA(wine_dtm, k = 4, control = list(seed = 5004))
wine_lda
## A LDA_VEM topic model with 4 topics.
# Per-topic word probabilities ("beta" matrix) in tidy long form.
topics <- tidy(wine_lda, matrix = "beta")
head(topics)
| topic | term | beta |
|---|---|---|
| 1 | acidity | 0.0029667 |
| 2 | acidity | 0.0042483 |
| 3 | acidity | 0.0040735 |
| 4 | acidity | 0.0067402 |
| 1 | crisp | 0.0000277 |
| 2 | crisp | 0.0003778 |
# Top 10 words per topic by beta. slice_max() replaces the superseded
# top_n(); both keep ties by default, so the result is unchanged.
top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Bar chart of the top terms within each topic. reorder_within() plus
# scale_x_reordered() keep the within-facet ordering correct.
top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# Per-document topic proportions ("gamma" matrix) in tidy long form.
topics <- tidy(wine_lda, matrix = "gamma")
head(topics)
| document | topic | gamma |
|---|---|---|
| 1 | 1 | 0.2510006 |
| 2 | 1 | 0.2499304 |
| 3 | 1 | 0.2480522 |
| 4 | 1 | 0.2425154 |
| 5 | 1 | 0.2518187 |
| 6 | 1 | 0.2563809 |
# Reload the raw wine data (the earlier pipelines overwrote `wine`).
wine <- read_rds("../resources/variety-project.rds") %>% rowid_to_column("id")
# One row per document with its four topic proportions, labelled by variety.
topics <- topics %>%
  pivot_wider(id_cols = document, names_from = topic, values_from = gamma, names_prefix = "topic_") %>%
  mutate(id = as.integer(document)) %>%   # tidy() returns document ids as character
  left_join(select(wine, id, variety), by = "id") %>%   # explicit join key (was implicit)
  select(-document, -id)
head(topics)
| topic_1 | topic_2 | topic_3 | topic_4 | variety |
|---|---|---|---|---|
| 0.2510006 | 0.2446511 | 0.2519079 | 0.2524405 | Pinot_Gris |
| 0.2499304 | 0.2480786 | 0.2499077 | 0.2520833 | Pinot_Noir |
| 0.2480522 | 0.2541088 | 0.2467133 | 0.2511257 | Pinot_Noir |
| 0.2425154 | 0.2499155 | 0.2567310 | 0.2508381 | Pinot_Noir |
| 0.2518187 | 0.2469421 | 0.2527943 | 0.2484449 | Pinot_Noir |
| 0.2563809 | 0.2480390 | 0.2455539 | 0.2500262 | Pinot_Noir |
# Naive Bayes on the 4 topic proportions alone.
# NOTE(review): no set.seed() before train(); CV folds are not reproducible.
fit <- train(variety ~ .,
data = topics,
method = "naive_bayes",
metric = "Kappa",
trControl = trainControl(method = "cv"))
# Evaluated on the training data — optimistic, like the PCA model above.
confusionMatrix(predict(fit, topics),factor(topics$variety))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Chardonnay Pinot_Gris Pinot_Noir Riesling Syrah
## Chardonnay 83 31 86 6 3
## Pinot_Gris 0 1 2 0 0
## Pinot_Noir 399 406 2649 217 167
## Riesling 0 0 0 1 0
## Syrah 0 0 0 0 2
##
## Overall Statistics
##
## Accuracy : 0.6751
## 95% CI : (0.6604, 0.6895)
## No Information Rate : 0.6753
## P-Value [Acc > NIR] : 0.5208
##
## Kappa : 0.0828
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Chardonnay Class: Pinot_Gris Class: Pinot_Noir
## Sensitivity 0.17220 0.0022831 0.9678
## Specificity 0.96472 0.9994467 0.0965
## Pos Pred Value 0.39713 0.3333333 0.6902
## Neg Pred Value 0.89620 0.8920988 0.5907
## Prevalence 0.11892 0.1080681 0.6753
## Detection Rate 0.02048 0.0002467 0.6536
## Detection Prevalence 0.05157 0.0007402 0.9470
## Balanced Accuracy 0.56846 0.5008649 0.5322
## Class: Riesling Class: Syrah
## Sensitivity 0.0044643 0.0116279
## Specificity 1.0000000 1.0000000
## Pos Pred Value 1.0000000 1.0000000
## Neg Pred Value 0.9449654 0.9580351
## Prevalence 0.0552677 0.0424377
## Detection Rate 0.0002467 0.0004935
## Detection Prevalence 0.0002467 0.0004935
## Balanced Accuracy 0.5022321 0.5058140
# Refit with many more topics (k = 20) and rebuild the gamma feature table.
wine_lda <- LDA(wine_dtm, k = 20, control = list(seed = 5004))
topics <- tidy(wine_lda, matrix = "gamma") %>%
  pivot_wider(id_cols = document, names_from = topic, values_from = gamma, names_prefix = "topic_") %>%
  mutate(id = as.integer(document)) %>%   # document ids come back as character
  left_join(select(wine, id, variety), by = "id") %>%   # explicit join key (was implicit)
  select(-document, -id)
# Naive Bayes on the 20 topic proportions.
# NOTE(review): no set.seed() before train(); CV folds are not reproducible.
fit <- train(variety ~ .,
data = topics,
method = "naive_bayes",
metric = "Kappa",
trControl = trainControl(method = "cv"))
# Evaluated on the training data — optimistic estimate.
confusionMatrix(predict(fit, topics),factor(topics$variety))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Chardonnay Pinot_Gris Pinot_Noir Riesling Syrah
## Chardonnay 164 26 55 13 3
## Pinot_Gris 72 212 93 51 5
## Pinot_Noir 232 183 2549 89 133
## Riesling 14 16 36 69 1
## Syrah 0 1 4 2 30
##
## Overall Statistics
##
## Accuracy : 0.7461
## 95% CI : (0.7324, 0.7595)
## No Information Rate : 0.6753
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4329
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Chardonnay Class: Pinot_Gris Class: Pinot_Noir
## Sensitivity 0.34025 0.48402 0.9313
## Specificity 0.97284 0.93887 0.5160
## Pos Pred Value 0.62835 0.48961 0.8001
## Neg Pred Value 0.91614 0.93757 0.7832
## Prevalence 0.11892 0.10807 0.6753
## Detection Rate 0.04046 0.05231 0.6289
## Detection Prevalence 0.06440 0.10683 0.7861
## Balanced Accuracy 0.65654 0.71144 0.7236
## Class: Riesling Class: Syrah
## Sensitivity 0.30804 0.174419
## Specificity 0.98250 0.998196
## Pos Pred Value 0.50735 0.810811
## Neg Pred Value 0.96043 0.964641
## Prevalence 0.05527 0.042438
## Detection Rate 0.01702 0.007402
## Detection Prevalence 0.03356 0.009129
## Balanced Accuracy 0.64527 0.586307
# Augment the 20 topic proportions with the remaining wine columns
# (variety plus, presumably, points) and log(price).
topics <- tidy(wine_lda, matrix = "gamma") %>%
  pivot_wider(id_cols = document, names_from = topic, values_from = gamma, names_prefix = "topic_") %>%
  mutate(id = as.integer(document)) %>%
  left_join(wine, by = "id") %>%   # explicit join key (was implicit)
  mutate(log_price = log(price)) %>%
  select(-document, -id, -description, -price)
# Naive Bayes on topic proportions plus the extra numeric features.
# NOTE(review): no set.seed() before train(); CV folds are not reproducible.
fit <- train(variety ~ .,
data = topics,
method = "naive_bayes",
metric = "Kappa",
trControl = trainControl(method = "cv"))
# Evaluated on the training data — optimistic estimate.
confusionMatrix(predict(fit, topics),factor(topics$variety))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Chardonnay Pinot_Gris Pinot_Noir Riesling Syrah
## Chardonnay 176 10 47 12 3
## Pinot_Gris 81 352 98 73 8
## Pinot_Noir 214 55 2572 48 126
## Riesling 11 21 15 90 1
## Syrah 0 0 5 1 34
##
## Overall Statistics
##
## Accuracy : 0.7955
## 95% CI : (0.7827, 0.8078)
## No Information Rate : 0.6753
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5664
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Chardonnay Class: Pinot_Gris Class: Pinot_Noir
## Sensitivity 0.36515 0.80365 0.9397
## Specificity 0.97984 0.92808 0.6634
## Pos Pred Value 0.70968 0.57516 0.8531
## Neg Pred Value 0.91958 0.97501 0.8410
## Prevalence 0.11892 0.10807 0.6753
## Detection Rate 0.04342 0.08685 0.6346
## Detection Prevalence 0.06119 0.15100 0.7439
## Balanced Accuracy 0.67249 0.86587 0.8015
## Class: Riesling Class: Syrah
## Sensitivity 0.40179 0.197674
## Specificity 0.98746 0.998454
## Pos Pred Value 0.65217 0.850000
## Neg Pred Value 0.96577 0.965612
## Prevalence 0.05527 0.042438
## Detection Rate 0.02221 0.008389
## Detection Prevalence 0.03405 0.009869
## Balanced Accuracy 0.69462 0.598064